In [12]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
In [13]:
# Remove pandas' display limits so wide DataFrames are shown with every
# column: both the column-count cap and the width setting are set to None.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)
In [14]:
# Read the bagged-ensemble predictions CSV into `df`
# (the path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='results/combined/Bagged_Ensemble_predictions.csv')
In [15]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)
Out[15]:
|  | Protocol | Flow Duration | Tot Fwd Pkts | Tot Bwd Pkts | TotLen Fwd Pkts | TotLen Bwd Pkts | Fwd Pkt Len Max | Fwd Pkt Len Min | Fwd Pkt Len Mean | Fwd Pkt Len Std | Bwd Pkt Len Max | Bwd Pkt Len Min | Bwd Pkt Len Mean | Bwd Pkt Len Std | Flow Byts/s | Flow Pkts/s | Flow IAT Mean | Flow IAT Std | Flow IAT Max | Flow IAT Min | Fwd IAT Tot | Fwd IAT Mean | Fwd IAT Std | Fwd IAT Max | Fwd IAT Min | Bwd IAT Tot | Bwd IAT Mean | Bwd IAT Std | Bwd IAT Max | Bwd IAT Min | Fwd PSH Flags | Bwd PSH Flags | Fwd URG Flags | Bwd URG Flags | Fwd Header Len | Bwd Header Len | Fwd Pkts/s | Bwd Pkts/s | Pkt Len Min | Pkt Len Max | Pkt Len Mean | Pkt Len Std | Pkt Len Var | FIN Flag Cnt | SYN Flag Cnt | RST Flag Cnt | PSH Flag Cnt | ACK Flag Cnt | URG Flag Cnt | CWE Flag Count | ECE Flag Cnt | Down/Up Ratio | Pkt Size Avg | Fwd Seg Size Avg | Bwd Seg Size Avg | Fwd Byts/b Avg | Fwd Pkts/b Avg | Fwd Blk Rate Avg | Bwd Byts/b Avg | Bwd Pkts/b Avg | Bwd Blk Rate Avg | Subflow Fwd Pkts | Subflow Fwd Byts | Subflow Bwd Pkts | Subflow Bwd Byts | Init Fwd Win Byts | Init Bwd Win Byts | Fwd Act Data Pkts | Fwd Seg Size Min | Active Mean | Active Std | Active Max | Active Min | Idle Mean | Idle Std | Idle Max | Idle Min | cm_label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.0 | 12.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 166666.670 | 12.0 | 0.0 | 12.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 12.0 | 0.0 | 12.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64.0 | 0.0 | 166666.670 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | -1.0 | 83.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | TN |
| 1 | 6.0 | 12.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 166666.670 | 12.0 | 0.0 | 12.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 12.0 | 0.0 | 12.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64.0 | 0.0 | 166666.670 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | -1.0 | 83.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | TN |
| 2 | 6.0 | 9.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 222222.220 | 9.0 | 0.0 | 9.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 9.0 | 0.0 | 9.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64.0 | 0.0 | 222222.220 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | -1.0 | 83.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | TN |
| 3 | 6.0 | 29.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 68965.516 | 29.0 | 0.0 | 29.0 | 29.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 29.0 | 29.0 | 0.0 | 29.0 | 29.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64.0 | 0.0 | 68965.516 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | -1.0 | 83.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | TN |
| 4 | 6.0 | 50.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 40000.000 | 50.0 | 0.0 | 50.0 | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 50.0 | 50.0 | 0.0 | 50.0 | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64.0 | 0.0 | 40000.000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | -1.0 | 83.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | TP |
In [16]:
# Dictionary translating each confusion-matrix code in `cm_label` into a
# more readable label. The labels are Vietnamese runtime strings:
#   TP -> "correctly detected MITM"
#   TN -> "correctly detected normal traffic"
#   FP -> "falsely flagged as MITM"
#   FN -> "missed attack"
label_mapping = dict(
    TP='Phát hiện đúng MITM',
    TN='Phát hiện đúng bình thường',
    FP='Báo nhầm là MITM',
    FN='Bỏ sót tấn công',
)
# Apply the substitution to the `cm_label` column.
df['cm_label'] = df['cm_label'].replace(to_replace=label_mapping)
In [17]:
# Tally how often each label occurs, as absolute counts and as a
# percentage of all labelled rows.
cm_labels = df['cm_label']
label_counts = cm_labels.value_counts()
label_percent = cm_labels.value_counts(normalize=True).mul(100)

# Put both series side by side in one summary table; percentages are
# rounded to two decimals for display only (`label_percent` stays unrounded).
summary_columns = {
    'Count': label_counts,
    'Percentage (%)': label_percent.round(2),
}
result = pd.DataFrame(summary_columns)
print(result)
                            Count  Percentage (%)
cm_label
Phát hiện đúng bình thường  24611           88.57
Phát hiện đúng MITM          2310            8.31
Bỏ sót tấn công               807            2.90
Báo nhầm là MITM               58            0.21
In [18]:
# Draw one boxplot per numeric column of `df`, grouped by `cm_label`
# (label on the y-axis, feature value on the x-axis).
# `sns` (seaborn) and `plt` (matplotlib.pyplot) are imported in the
# notebook's first cell together with the other dependencies.
for col in df.select_dtypes(include='number').columns:
    # Explicit figure/axes rather than pyplot's implicit "current figure",
    # so the plot is guaranteed to land on the axes created here.
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.boxplot(y='cm_label', x=col, data=df, ax=ax)
    ax.set_title(f'Boxplot of {col} by Label')
    plt.show()
    # One figure is created per column; close each one once shown so open
    # figures do not pile up over the loop.
    plt.close(fig)
In [19]:
# Draw one violin plot per numeric column of `df`, grouped by `cm_label`
# (label on the y-axis, feature value on the x-axis).
# `sns` (seaborn) and `plt` (matplotlib.pyplot) are imported in the
# notebook's first cell together with the other dependencies.
for col in df.select_dtypes(include='number').columns:
    # Explicit figure/axes rather than pyplot's implicit "current figure",
    # so the plot is guaranteed to land on the axes created here.
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.violinplot(y='cm_label', x=col, data=df, ax=ax)
    ax.set_title(f'Violin plot of {col} by Label')
    plt.show()
    # One figure is created per column; close each one once shown so open
    # figures do not pile up over the loop.
    plt.close(fig)